DGN-WB Elastic Net 10-fold CV Results for gene expression prediction (SNPs within 1Mb of each gene)
Compare alpha=0.5 to alpha=1
library(ggplot2)
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
"%&%" = function(a,b) paste(a,b,sep="")
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
alpha0.5 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha0.5_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
dim(alpha1)
## [1] 13171 8
dim(alpha0.5)
## [1] 13171 8
en <- inner_join(alpha1,alpha0.5,by="gene")
with(en,plot(R2.x,R2.y,xlab='alpha=1 R2',ylab='alpha=0.5 R2',xlim=c(0,1),ylim=c(0,1),main='E-N DGN-WB all 13K protein coding genes',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(en,t.test(R2.x>R2.y))
##
## One Sample t-test
##
## data: R2.x > R2.y
## t = 111.7318, df = 11232, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.5171606 0.5356303
## sample estimates:
## mean of x
## 0.5263954
Compare polyscore thresholds
"%&%" = function(a,b) paste(a,b,sep="")
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps3 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.001_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps2 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.01_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps05 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.05_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps5 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.5_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps4 <- ps4 %>% select(gene,CV.R2)
ps3 <- ps3 %>% select(gene,CV.R2)
ps2 <- ps2 %>% select(gene,CV.R2)
ps05 <- ps05 %>% select(gene,CV.R2)
ps5 <- ps5 %>% select(gene,CV.R2)
ps1 <- ps1 %>% select(gene,CV.R2)
all <- inner_join(ps4,ps3,by='gene')
all <- inner_join(all,ps2,by='gene')
all <- inner_join(all,ps05,by='gene')
all <- inner_join(all,ps5,by='gene')
all <- inner_join(all,ps1,by='gene')
colnames(all) <- c('gene',0.0001,0.001,0.01,0.05,0.5,1)
all <- all[complete.cases(all),]
all_long <- melt(all,by=gene)
## Using gene as id variables
ngenes<-dim(all)[1]
p <- ggplot(all_long, aes(x = -log10(as.numeric(levels(variable))[variable]), y = value), group=gene) + geom_line(lwd = 0.5, show_guide = FALSE) + aes(color = gene) + xlab("-log10(p) threshold") + ylab("R2") + ggtitle("Polyscore DGN-WB (" %&% ngenes %&% " protein coding genes)")
print(p)

p <- ggplot(all_long, aes(x = -log10(as.numeric(levels(variable))[variable]), y = value), group=gene) + geom_line(lwd = 0.5, show_guide = FALSE) + aes(color = gene) + xlab("-log10(p) threshold") + ylab("R2") + ggtitle("Polyscore DGN-WB (protein coding genes with R2>0.5)") + coord_cartesian(ylim=c(0.5,0.9))
print(p)

Compare polyscore to E-N
"%&%" = function(a,b) paste(a,b,sep="")
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps05 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.05_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
all <- inner_join(alpha1,ps4,by='gene')
all <- inner_join(all,ps05,by='gene')
with(all,plot(R2,CV.R2.x,xlab='LASSO (alpha=1) R2',ylab='Polyscore P < 0.0001 R2',xlim=c(0,1),ylim=c(0,1),main='DGN-WB predictive performance',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(all,t.test(R2>CV.R2.x))
##
## One Sample t-test
##
## data: R2 > CV.R2.x
## t = 235.2103, df = 10428, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.8343917 0.8484159
## sample estimates:
## mean of x
## 0.8414038
with(all,plot(R2,CV.R2.y,xlab='LASSO (alpha=1) R2',ylab='Polyscore P < 0.05 R2',xlim=c(0,1),ylim=c(0,1),main='DGN-WB predictive performance',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(all,t.test(R2>CV.R2.y))
##
## One Sample t-test
##
## data: R2 > CV.R2.y
## t = 213.8756, df = 11551, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.7910726 0.8057071
## sample estimates:
## mean of x
## 0.7983899
with(all,plot(CV.R2.x,CV.R2.y,xlab='Polyscore P < 0.0001 R2',ylab='Polyscore P < 0.05 R2',xlim=c(0,1),ylim=c(0,1),main='DGN-WB predictive performance',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(all,t.test(CV.R2.x>CV.R2.y))
##
## One Sample t-test
##
## data: CV.R2.x > CV.R2.y
## t = 128.6931, df = 10493, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.6028352 0.6214834
## sample estimates:
## mean of x
## 0.6121593
ps2 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.01_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
all <- inner_join(all,ps2,by='gene')
with(all,plot(CV.R2.x,CV.R2,xlab='Polyscore P < 0.0001 R2',ylab='Polyscore P < 0.01 R2',xlim=c(0,1),ylim=c(0,1),main='DGN-WB predictive performance',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(all,t.test(CV.R2.x>CV.R2))
##
## One Sample t-test
##
## data: CV.R2.x > CV.R2
## t = 113.1659, df = 10493, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.5401268 0.5591681
## sample estimates:
## mean of x
## 0.5496474
Compare n.snps per model
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
alpha0.5 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha0.5_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps05 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.05_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
alpha1 <- alpha1 %>% select(gene,n.snps)
alpha0.5 <- alpha0.5 %>% select(gene,n.snps)
ps4 <- ps4 %>% select(gene,n.snps)
ps05 <- ps05 %>% select(gene,n.snps)
all <- inner_join(alpha1,alpha0.5,by='gene')
all <- inner_join(all,ps4,by='gene')
all <- inner_join(all,ps05,by='gene')
colnames(all) <- c('gene','EN.a=1','EN.a=0.5','PS.e-04','PS.0.05')
all_long <- melt(all,by=gene)
## Using gene as id variables
p <- ggplot(all_long, aes(x = variable, y = value))
p + geom_boxplot() + ggtitle("Number of SNPs in predictive model") + xlab("model") + ylab("n SNPs")
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

p <- ggplot(all_long, aes(x = variable, y = value))
p + geom_boxplot() + ggtitle("Number of SNPs in predictive model") + xlab("model") + ylab("n SNPs") + coord_cartesian(ylim=c(0,250))
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Compare LASSO R2 to gcta local h2
"%&%" = function(a,b) paste(a,b,sep="")
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
h2 <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
all <- inner_join(alpha1,h2,by='gene')
data <- all %>% mutate(ymin = pmax(0, h2 - 2 * se), ymax = pmin(1, h2 + 2 * se) ) %>% arrange(h2)
data <- data[complete.cases(data),]
p1<-ggplot(data,aes(x=1:nrow(data),y=h2,ymin=ymin, ymax=ymax) ) + geom_pointrange(col='gray')+geom_point()
p1 + geom_point(data=data,aes(x=1:nrow(data),y=R2),color='red',cex=0.5) + xlab("Genes sorted by h2") + ylab("h2 (black) or R2 (red)") + ggtitle('DGN-WB Local h2 (black) and LASSO 10-fold CV R2 (red)')

Compare polyscore R2 to gcta local h2
"%&%" = function(a,b) paste(a,b,sep="")
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
h2 <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
all <- inner_join(ps4,h2,by='gene')
data <- all %>% mutate(ymin = pmax(0, h2 - 2 * se), ymax = pmin(1, h2 + 2 * se) ) %>% arrange(h2)
data <- data[complete.cases(data),]
p1<-ggplot(data,aes(x=1:nrow(data),y=h2,ymin=ymin, ymax=ymax) ) + geom_pointrange(col='gray')+geom_point()
p1 + geom_point(data=data,aes(x=1:nrow(data),y=CV.R2),color='red',cex=0.5) + xlab("Genes sorted by h2") + ylab("h2 (black) or R2 (red)") + ggtitle('DGN-WB Local h2 (black) and Polyscore (P<e-04) 10-fold CV R2 (red)')

##calc prop genes with h2>0.1
a<-h2$h2>0.1
table(a)/sum(table(a))
## a
## FALSE TRUE
## 0.6281506 0.3718494
##calc prop genes with h2>0.01
a<-h2$h2>0.01
table(a)/sum(table(a))
## a
## FALSE TRUE
## 0.1851553 0.8148447
Compare gcta local h2 to mean expression level
h2 <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
exp <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.mean.median.expression.2015-02-23.txt',header=T)
all <- inner_join(h2,exp,by='gene')
dim(all)
## [1] 13648 6
ggplot(all,aes(x=h2,y=expmeans)) + geom_point() + geom_smooth(method = "lm") + xlab("Local h2") + ylab("Mean expression") + ggtitle("DGN-WB (each point is a gene)")

summary(lm(expmeans~h2,all))
##
## Call:
## lm(formula = expmeans ~ h2, data = all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.117e-03 -1.898e-04 -1.300e-07 1.889e-04 1.095e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.225e-05 3.114e-06 -3.935 8.37e-05 ***
## h2 1.798e-05 1.565e-05 1.149 0.251
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002864 on 13646 degrees of freedom
## Multiple R-squared: 9.673e-05, Adjusted R-squared: 2.345e-05
## F-statistic: 1.32 on 1 and 13646 DF, p-value: 0.2506
##No relationship between mean expression and h2
ggplot(all,aes(x=h2,y=expmedians)) + geom_point() + geom_smooth(method = "lm") + xlab("Local h2") + ylab("Median expression") + ggtitle("DGN-WB (each point is a gene)")

summary(lm(expmedians~h2,all))
##
## Call:
## lm(formula = expmedians ~ h2, data = all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55834 -0.02875 0.00293 0.03249 0.54593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0307528 0.0006562 46.865 < 2e-16 ***
## h2 -0.0158342 0.0032970 -4.803 1.58e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06034 on 13646 degrees of freedom
## Multiple R-squared: 0.001687, Adjusted R-squared: 0.001614
## F-statistic: 23.06 on 1 and 13646 DF, p-value: 1.583e-06
##Lower median expression has a larger h2
toph2 <- all[all[,2]>0.1,]
dim(toph2)
## [1] 5075 6
ggplot(toph2,aes(x=h2,y=expmedians)) + geom_point() + geom_smooth(method = "lm") + xlab("Local h2") + ylab("Median expression") + ggtitle("DGN-WB h2>0.1 (each point is a gene)")

summary(lm(expmedians~h2,toph2))
##
## Call:
## lm(formula = expmedians ~ h2, data = toph2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.51324 -0.03158 0.00332 0.03658 0.49316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.031877 0.002092 15.239 < 2e-16 ***
## h2 -0.018384 0.006507 -2.825 0.00474 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07698 on 5073 degrees of freedom
## Multiple R-squared: 0.001571, Adjusted R-squared: 0.001374
## F-statistic: 7.982 on 1 and 5073 DF, p-value: 0.004742
##Lower median expression has a larger h2